import os 
import json
import numpy as np
import pandas as pd
import dill as pickle
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings("ignore")


def build_and_train():
    """Train a grid-searched random-forest pipeline on the loan training data.

    Reads ``../data/training.csv`` (resolved against the current working
    directory), drops rows with a missing ``Gender``, ``Married``,
    ``Credit_History`` or ``LoanAmount``, and fits a ``GridSearchCV`` (3-fold)
    over a ``PreProcessing`` + ``RandomForestClassifier`` pipeline on 75% of
    the remaining rows. The other 25% is split off with ``random_state=42``
    but is not used by this function.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    data = pd.read_csv('../data/training.csv')
    data = data.dropna(subset=['Gender', 'Married', 'Credit_History', 'LoanAmount'])

    pred_var = [
        'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
        'ApplicantIncome', 'CoapplicantIncome',
        'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
    ]

    # Keep the same split as before so the training rows do not change; the
    # held-out 25% is discarded here because nothing in this function uses it.
    X_train, _, y_train, _ = train_test_split(
        data[pred_var],
        data['Loan_Status'],
        test_size=0.25,
        random_state=42,
    )

    # `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0;
    # it returns the same underlying NumPy array on old and new pandas alike.
    y_train = y_train.replace({'Y': 1, 'N': 0}).values

    pipe = make_pipeline(PreProcessing(), RandomForestClassifier())

    # Keys are prefixed with the step name that make_pipeline derives from the
    # estimator class ("randomforestclassifier").
    # NOTE(review): `min_impurity_split` was deprecated in scikit-learn 0.19 and
    # removed in 1.0 in favour of `min_impurity_decrease`. It is kept here so
    # the search space is unchanged on the scikit-learn version this file
    # targets; it must be replaced when upgrading.
    param_grid = {
        "randomforestclassifier__n_estimators": [10, 20, 30],
        "randomforestclassifier__max_depth": [None, 6, 8, 10],
        "randomforestclassifier__max_leaf_nodes": [None, 5, 10, 20],
        "randomforestclassifier__min_impurity_split": [0.1, 0.2, 0.3],
    }

    grid = GridSearchCV(pipe, param_grid=param_grid, cv=3)
    grid.fit(X_train, y_train)

    return grid


class PreProcessing(BaseEstimator, TransformerMixin):
    """Custom pre-processing estimator for our use-case.

    ``fit`` learns the mean of ``Loan_Amount_Term`` and ``LoanAmount`` from the
    data it is given. ``transform`` selects the predictor columns, fills
    missing values (using those learned means for the two numeric columns and
    fixed defaults for the rest), replaces the categorical labels with integer
    codes and returns the result as a NumPy array.
    """

    def __init__(self):
        pass

    def fit(self, df, y=None, **fit_params):
        """Learn the imputation means from ``df``.

        Args:
            df: DataFrame containing ``Loan_Amount_Term`` and ``LoanAmount``.
            y: Ignored; accepted so the estimator can be used in a pipeline.
            **fit_params: Ignored.

        Returns:
            ``self``, with ``term_mean_`` and ``amt_mean_`` set.
        """
        self.term_mean_ = df['Loan_Amount_Term'].mean()
        self.amt_mean_ = df['LoanAmount'].mean()
        return self

    def transform(self, df):
        """Impute and encode the predictor columns of ``df``.

        ``fit`` must have been called first, because the fill values for
        ``Loan_Amount_Term`` and ``LoanAmount`` are read from ``term_mean_``
        and ``amt_mean_``.

        Args:
            df: DataFrame containing at least the eleven predictor columns.

        Returns:
            A 2-D NumPy array with one column per predictor, in the order
            listed in ``pred_var``. The input ``df`` is not modified.
        """
        pred_var = [
            'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
            'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
            'Loan_Amount_Term', 'Credit_History', 'Property_Area',
        ]

        # Work on an explicit copy: assigning into the bare result of
        # `df[pred_var]` triggers pandas' chained-assignment warnings, and the
        # copy makes it unambiguous that the caller's frame is left untouched.
        features = df[pred_var].copy()

        fill_values = {
            'Dependents': 0,
            'Self_Employed': 'No',
            'Loan_Amount_Term': self.term_mean_,
            'Credit_History': 1,
            'Married': 'No',
            'Gender': 'Male',
            'LoanAmount': self.amt_mean_,
        }
        for column, fill in fill_values.items():
            features[column] = features[column].fillna(fill)

        # Per-column label -> integer code mappings. Values that do not appear
        # as keys are left unchanged by `replace`.
        encodings = {
            'Gender': {'Female': 0, 'Male': 1},
            'Married': {'No': 0, 'Yes': 1},
            'Education': {'Graduate': 0, 'Not Graduate': 1},
            'Self_Employed': {'No': 0, 'Yes': 1},
            'Property_Area': {'Rural': 0, 'Urban': 1, 'Semiurban': 2},
            'Dependents': {'3+': 3, '0': 0, '2': 2, '1': 1},
        }
        features = features.replace(encodings)

        # `.values` replaces `DataFrame.as_matrix()`, which was removed in
        # pandas 1.0; it is available on old and new pandas alike.
        return features.values

if __name__ == '__main__':
    # Script entry point: train the model, then serialise the fitted search
    # object into the flask_api models directory (relative to the CWD).
    trained = build_and_train()
    output_path = os.path.join('../flask_api/models/', 'model_v2.pk')
    with open(output_path, 'wb') as handle:
        pickle.dump(trained, handle)
